Purpose

The purpose of this document is to reproduce the pipeline used to score genes on the basis of their expression in different Plasmodium species and life cycle stages, with the goal of selecting liver stage candidates.

Set up

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(ggplot2)
library(scales)
library(ggrepel)
library(ggpubr)
library(patchwork)
library(openxlsx)

source('~/R projects/scriptsCommon/colorworks.R')

Integrate the datasets

Gather the data

RNA-seq data for different species and life cycle stages was retrieved from PlasmoDB as separate datasets. They need to be integrated before they can be used. For some datasets with multiple replicates, the expression of a gene is summarized as the mean or the geometric mean of all values.

geomean <- function(x){exp(mean(log(x)))}

P. cynomolgi datasets

  • Pcyn 100 days blood
pcyn100 <- read.delim('Data/PlasmoDB/Pcyn_100daysBloodRNAseq.txt')
pcyn100 <- pcyn100[,(!grepl('antisense', colnames(pcyn100)))]
pcyn100 <- pcyn100[,!(grepl('sense', colnames(pcyn100)) & ! grepl('unique.only', colnames(pcyn100)))]
pcyn100 %<>% select(-X)
pcyn100 <- pcyn100[,!(grepl('sense', colnames(pcyn100)) & !grepl('WB.+TP2', colnames(pcyn100)))] #TP2 corresponds to acute parasitemia, WB is for whole blood
colnames(pcyn100) <- gsub('(.+\\.\\.\\.WB_)(.+)(\\.TP2\\.\\.\\.unique.only)','\\2',colnames(pcyn100))
pcyn100 %<>% rename('P.cyn.Gene.ID'='Gene.ID')
head(pcyn100)
##   P.cyn.Gene.ID          source_id Gene.Name.or.Symbol gene_source_id
## 1  PcyM_0000100 PcyM_0000100-t36_1                 N/A   PcyM_0000100
## 2  PcyM_0000200 PcyM_0000200-t36_1                 N/A   PcyM_0000200
## 3  PcyM_0000300 PcyM_0000300-t36_1                 N/A   PcyM_0000300
## 4  PcyM_0000350 PcyM_0000350-t36_1                 N/A   PcyM_0000350
## 5  PcyM_0000400 PcyM_0000400-t36_1                 N/A   PcyM_0000400
## 6  PcyM_0000500 PcyM_0000500-t36_1                 N/A   PcyM_0000500
##   Mmul_RIc14.Day21 Mmul_RSb14.Day21 Mmul_RMe14.Day20 Mmul_RFa14.Day18
## 1             2.19             0.32             1.13             0.54
## 2             0.00             0.00             0.07             0.06
## 3            11.17            14.62            13.65            12.63
## 4             0.44             0.26             0.40             0.40
## 5             8.68             7.66             9.44            11.45
## 6             7.77             9.07             8.41             9.29
  • Join with P. falciparum orthology table
pcyn_ort <- read.delim('Data/PlasmoDB/PcynPfal_SyntenicOrthologs.txt')

pcyn100 <- merge(pcyn100, pcyn_ort, by.x='P.cyn.Gene.ID', by.y='Input.Ortholog.s.', all.x=TRUE, all.y=FALSE)
pcyn100 %<>% 
  filter(!is.na(Gene.ID)) %>%
  select(-c(source_id.x, source_id.y, X, Gene.Name.or.Symbol.x)) %>% 
  rename('Gene.Name.or.Symbol'='Gene.Name.or.Symbol.y')

pcyn100 %<>% rowwise() %>% mutate(PcynB.mean=mean(c_across(where(is.numeric))),
                                  PcynB.max=max(c_across(where(is.numeric)))) %>%
  ungroup()

pcyn100 %>% slice_sample(n=5)
## # A tibble: 5 × 11
##   P.cyn.Gene.ID gene_source_id Mmul_RIc14.Day21 Mmul_RSb14.Day21
##   <chr>         <chr>                     <dbl>            <dbl>
## 1 PcyM_0621300  PcyM_0621300               45.7             50.9
## 2 PcyM_1305300  PcyM_1305300               64.0            143. 
## 3 PcyM_0410700  PcyM_0410700              333.             261. 
## 4 PcyM_1445000  PcyM_1445000              159.             157. 
## 5 PcyM_1443300  PcyM_1443300               46.5             49  
## # ℹ 7 more variables: Mmul_RMe14.Day20 <dbl>, Mmul_RFa14.Day18 <dbl>,
## #   Gene.ID <chr>, Organism <chr>, Gene.Name.or.Symbol <chr>, PcynB.mean <dbl>,
## #   PcynB.max <dbl>
  • P. cynomolgi liver stages dataset and orthology ####
pcyn <- read.table('Data/PlasmoDB/Cubi_PcynNormalizedCounts.txt', sep='\t', header=TRUE)
colnames(pcyn)[5:7] <- c('P.cyn.Liver.Schizont','P.cyn.Blood','P.cyn.Hypnozoite')
pcyn %<>% select(-c(2,3,8))
pcyn %<>% rename(P.cyn.Gene.ID=Gene.ID, P.cyn.Symbol=Gene.Name.or.Symbol, PcynB.liverDataset=P.cyn.Blood)

pcyn %<>% select(c('P.cyn.Gene.ID','P.cyn.Liver.Schizont','PcynB.liverDataset')) %>%
  right_join(pcyn_ort %>% select(Input.Ortholog.s., Gene.ID, Gene.Name.or.Symbol),
              by=c('P.cyn.Gene.ID'='Input.Ortholog.s.')) #I'm keeping the data only for Pc genes that have syntenic orthologs in Pf

pcyn %<>% distinct(.keep_all = T) 
pcyn %>% slice_sample(n=5)
##   P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset       Gene.ID
## 1  PcyM_1454100               243.82             128.85 PF3D7_1231500
## 2  PcyM_0946100                78.13              63.77 PF3D7_1141400
## 3  PcyM_1019400               102.38              73.65 PF3D7_0513400
## 4  PcyM_0720100               139.51              57.03 PF3D7_0921300
## 5  PcyM_1270100               453.02             385.22 PF3D7_1445700
##   Gene.Name.or.Symbol
## 1                 N/A
## 2                PIGH
## 3               YihA1
## 4                 N/A
## 5                 N/A
  • Gather all P. cynomolgi data
pcyn %<>%
  left_join(pcyn100[,c(1,3:6,10,11)],
            by='P.cyn.Gene.ID')
## Warning in left_join(., pcyn100[, c(1, 3:6, 10, 11)], by = "P.cyn.Gene.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 168 of `x` matches multiple rows in `y`.
## ℹ Row 462 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
pcyn %>% slice_sample(n=5)
##               P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset
## 1              PcyM_1029800                15.67             288.27
## 2              PcyM_0213000                25.14              32.04
## 3 PcyM_0306400,PcyM_0306500                   NA                 NA
## 4              PcyM_0734300                94.50              20.22
## 5              PcyM_1140000               112.70              69.64
##         Gene.ID Gene.Name.or.Symbol Mmul_RIc14.Day21 Mmul_RSb14.Day21
## 1 PF3D7_0504300                 N/A            42.08            44.52
## 2 PF3D7_0104700                MFR2             3.57             4.40
## 3 PF3D7_0404400                 P36               NA               NA
## 4 PF3D7_0932800                CSE1           137.12           136.47
## 5 PF3D7_0611400                SWIB            53.03            57.71
##   Mmul_RMe14.Day20 Mmul_RFa14.Day18 PcynB.mean PcynB.max
## 1            64.33            44.17    48.7750     64.33
## 2             8.19             7.74     5.9750      8.19
## 3               NA               NA         NA        NA
## 4           123.76           123.57   130.2300    137.12
## 5            51.77            54.06    54.1425     57.71
  • The rank of gene expression for the blood stage of P. cynomolgi is strongly correlated between the two datasets available, especially for genes with very high and very low expression (which are the most interesting for our search). No
pcyn %>%
  mutate_at(.vars=c('PcynB.liverDataset','PcynB.mean'), .funs=rank) %>%
  ggplot(aes(x=PcynB.liverDataset, y=PcynB.mean))+
  geom_density2d_filled(show.legend=FALSE)+
  geom_point(size=0.05, color='white', alpha=0.2)+
  stat_cor(method='spearman', color='white')+
  scale_fill_viridis_d(option='mako')+
  scale_x_continuous(expand=expansion(mult=0))+
  scale_y_continuous(expand=expansion(mult=0))+
  theme_classic()+theme(aspect.ratio=1)

  • Define PcL and PcD as the log2-transformed fold change and the difference between liver and blood stage expression for each gene
offst <- 0.001
pcyn %<>%  mutate(PcL=log2( (P.cyn.Liver.Schizont+offst)/(PcynB.mean+offst) ),
                  PcD=P.cyn.Liver.Schizont-PcynB.mean)

pcyn %>%
  slice_sample(n=5)
##   P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset       Gene.ID
## 1  PcyM_0414800                81.45              22.09 PF3D7_0208600
## 2  PcyM_1104800              2167.36            1441.75 PF3D7_1366500
## 3  PcyM_1425900               153.63             102.80 PF3D7_0715700
## 4  PcyM_0911100                 4.95              23.75 PF3D7_1109100
## 5  PcyM_0941800                97.22              39.68 PF3D7_1136800
##   Gene.Name.or.Symbol Mmul_RIc14.Day21 Mmul_RSb14.Day21 Mmul_RMe14.Day20
## 1                RRF1            23.43            25.68            19.11
## 2                 NDK           268.81           263.05           382.45
## 3                 N/A            22.29            25.76            23.79
## 4                 N/A           113.96           122.07            78.96
## 5                 N/A            14.52            16.87            15.64
##   Mmul_RFa14.Day18 PcynB.mean PcynB.max       PcL       PcD
## 1            20.38    22.1500     25.68  1.878561   59.3000
## 2           234.95   287.3150    382.45  2.915229 1880.0450
## 3            17.41    22.3125     25.76  2.783481  131.3175
## 4            22.69    84.4200    122.07 -4.091810  -79.4700
## 5            16.02    15.7625     16.87  2.624680   81.4575
  • Visualize distribution of log2 of gene expression between liver and blood stage
pcyn %>% ggplot(aes(x=PcL))+stat_density()+
  geom_boxplot(mapping=aes(y=-1e-2), width=0.01)+
  ggtitle(expr(italic('P. cynomolgi')))+
  theme_classic()
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

P. vivax datasets

  • Liver stage dataset
pviv <- read.delim('Data/PlasmoDB/Gural_PvivNormalizedCounts.txt', sep='\t', header=TRUE)
colnames(pviv)[c(4,5)] <- c('P.viv.Mixed1','P.viv.Mixed2')
pviv %<>% rename(P.viv.Gene.ID=Gene.ID) %>% select(-c(source_id, X))

pviv_ort <- read.delim('Data/PlasmoDB/PvivPfal_SyntenicOrthologs.txt', sep='\t', header=TRUE)
pviv_ort %<>% select(-c(source_id, X)) 
pviv_ort %<>% rename(P.viv.Gene.ID=Input.Ortholog.s.)

pviv %<>%
  right_join(pviv_ort, by='P.viv.Gene.ID') %>%
  distinct() %>% 
  distinct(.keep_all=TRUE)

# Summarize normalized counts for replicates as the mean
pviv %<>% 
  rowwise() %>%
  mutate(Pviv.Lmean=mean(c(P.viv.Mixed1,P.viv.Mixed2))) %>%
  ungroup()

pviv %>%
  slice_sample(n=5) 
## # A tibble: 5 × 9
##   P.viv.Gene.ID gene_source_id.x P.viv.Mixed1 P.viv.Mixed2 Gene.ID      
##   <chr>         <chr>                   <dbl>        <dbl> <chr>        
## 1 PVP01_1213600 PVP01_1213600          720.         507.   PF3D7_1341200
## 2 PVP01_0213500 PVP01_0213500           83.1         63.2  PF3D7_0727800
## 3 PVP01_0725400 PVP01_0725400           44.0         46.6  PF3D7_0926900
## 4 PVP01_1434600 PVP01_1434600            1.25         2.48 PF3D7_1215800
## 5 PVP01_0935200 PVP01_0935200           83.0         78.9  PF3D7_1134400
## # ℹ 4 more variables: Product.Description <chr>, gene_source_id.y <chr>,
## #   Gene.Name.or.Symbol <chr>, Pviv.Lmean <dbl>
  • IED cycle dataset
#Pviv IED cycle ####
pviv.B <- read.delim('Data/PlasmoDB/Pviv_IEDCycle.txt') %>% select(-X)
colnames(pviv.B) <- gsub('(smru.+\\.\\.\\.)(smru.+)(\\.\\.\\..+)', '\\2', colnames(pviv.B))
pviv.B %<>% rename(P.viv.Gene.ID=Gene.ID)

# Get summary values for blood stage gene expression (max and mean)
pviv.B %<>% rowwise() %>% mutate(Pviv.Bmax=max(c_across(where(is.numeric))),
                                 Pviv.Bmean=mean(c_across(where(is.numeric)))) %>%
  ungroup()

pviv %<>%
  left_join(pviv.B %>% 
              select(c('P.viv.Gene.ID', 'Pviv.Bmax', 'Pviv.Bmean')))
## Joining with `by = join_by(P.viv.Gene.ID)`
  • Define PvL and PvD as the log2-transformed fold change and the difference between liver and blood stage expression for each gene
pviv %<>% rowwise() %>% mutate(PvL=log2((Pviv.Lmean+offst)/(Pviv.Bmax+offst)),
                               PvD=Pviv.Lmean-Pviv.Bmax) %>%
  ungroup()

pviv %>% slice_sample(n=5)
## # A tibble: 5 × 13
##   P.viv.Gene.ID gene_source_id.x P.viv.Mixed1 P.viv.Mixed2 Gene.ID      
##   <chr>         <chr>                   <dbl>        <dbl> <chr>        
## 1 PVP01_1238400 PVP01_1238400            54.9       120.   PF3D7_1468400
## 2 PVP01_0505700 PVP01_0505700            10.3        17.1  PF3D7_0828700
## 3 PVP01_0927200 PVP01_0927200            11.0         6.52 PF3D7_1126500
## 4 PVP01_1423400 PVP01_1423400           161.        141.   PF3D7_0714300
## 5 PVP01_1308300 PVP01_1308300           202.        168.   PF3D7_1209200
## # ℹ 8 more variables: Product.Description <chr>, gene_source_id.y <chr>,
## #   Gene.Name.or.Symbol <chr>, Pviv.Lmean <dbl>, Pviv.Bmax <dbl>,
## #   Pviv.Bmean <dbl>, PvL <dbl>, PvD <dbl>
  • Visualize
pviv %>% ggplot(aes(x=PvL))+stat_density()+
  geom_boxplot(mapping=aes(y=-1e-2), width=0.01)+
  ggtitle(expr(italic('P. vivax')))+
  theme_classic()
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

P. falciparum datasets

Seven stages (López-Barragán)
sevenstg <-  read.delim('Data/PlasmoDB/LopezBarragan_Pf7stagesNormalizedCounts.txt')
colnames(sevenstg)[6:12] <- gsub('(P..falciparum.Su.Seven.Stages.RNA.Seq.data...)(.+)(...unique..3D7.7Stages.RNA.Seq.)', '\\2', colnames(sevenstg[6:12]))
colnames(sevenstg) <- gsub(' ', '\\.', colnames(sevenstg))
sevenstg %<>% select(-c(source_id, Product.Description, gene_source_id, Ookinete, X))
sevenstg %<>% distinct(.keep_all=TRUE)

#Summarize gene expression in Pf blood stages as the max expression with exclusion of gametocytes
sevenstg %<>% 
  rowwise() %>% 
  mutate(seven.max=max(c_across(where(is.numeric))),
         seven.max.excl.gam=max(Ring,Early.Trophozoite,Late.Trophozoite,Schizont))

sevenstg %>% 
  slice_sample(n=5)
## # A tibble: 5,460 × 10
## # Rowwise: 
##    Gene.ID Gene.Name.or.Symbol  Ring Early.Trophozoite Late.Trophozoite Schizont
##    <chr>   <chr>               <dbl>             <dbl>            <dbl>    <dbl>
##  1 PF3D7_… VAR                 32.2               4.01             3.8      1.62
##  2 PF3D7_… RIF                  5.97              0.77             2.14     0   
##  3 PF3D7_… VAR                  2.13              0.15             0.06     0.16
##  4 PF3D7_… RIF                  4.56              4.79            15.9      1.11
##  5 PF3D7_… N/A                  1.52              0                2.23     0   
##  6 PF3D7_… RIF                  2.52              0.14             0.46     0   
##  7 PF3D7_… N/A                  0                 0                0        0   
##  8 PF3D7_… RIF                  1.31              0                0        0   
##  9 PF3D7_… RIF                  0.48              2.18             0.36     0   
## 10 PF3D7_… RIF                  2.68              0                0.12     0   
## # ℹ 5,450 more rows
## # ℹ 4 more variables: Gametocyte.II <dbl>, Gametocyte.V <dbl>, seven.max <dbl>,
## #   seven.max.excl.gam <dbl>
Bartfai time series
idc.B <- read.xlsx('Data/PlasmoDB/Bartfai_PfIDCtimepoints.xlsx')
colnames(idc.B) <- gsub('(sense...)(.+)(...unique.+)', '\\2', colnames(idc.B))

# Summarize as mean, max, and cumulative expression for each gene
idc.B %<>% rowwise() %>% mutate(idcB.max=max(c_across(where(is.numeric))),
                                idcB.mean=mean(c_across(where(is.numeric))),
                                idcB.cumul=sum(c_across(starts_with('T')))
) %>%
  ungroup()

idc.B %>% 
  slice_sample(n=5)
## # A tibble: 5 × 16
##   Gene.ID       source_id Product.Description gene_source_id Gene.Name.or.Symbol
##   <chr>         <chr>     <chr>               <chr>          <chr>              
## 1 PF3D7_1111500 PF3D7_11… acylphosphatase, p… PF3D7_1111500  N/A                
## 2 PF3D7_0817500 PF3D7_08… histidine triad nu… PF3D7_0817500  HINT1              
## 3 PF3D7_0710600 PF3D7_07… 60S ribosomal prot… PF3D7_0710600  RPL34              
## 4 PF3D7_1435700 PF3D7_14… ataxin-2 like prot… PF3D7_1435700  N/A                
## 5 PF3D7_0532600 PF3D7_05… Plasmodium exporte… PF3D7_0532600  N/A                
## # ℹ 11 more variables: T40 <dbl>, T35 <dbl>, T25 <dbl>, T30 <dbl>, T15 <dbl>,
## #   T20 <dbl>, T05 <dbl>, T10 <dbl>, idcB.max <dbl>, idcB.mean <dbl>,
## #   idcB.cumul <dbl>

F1: Discrepant expression between liver and blood

In order to compute the divergence between liver and blood stage expression within each model of infection, the expression values will be transformed into ranks.

pcyn %<>% ungroup() %>%
  mutate(PcLiverRank=rank(P.cyn.Liver.Schizont),
         PcBloodRank=rank(PcynB.mean))

pviv %<>% ungroup() %>%
  mutate(PvLiverRank=rank(Pviv.Lmean),
         PvBloodRank=rank(Pviv.Bmax)) 

LB.ranks <- inner_join(pcyn %>% select(Gene.ID, P.cyn.Gene.ID, PcLiverRank, PcBloodRank), 
                       pviv %>% select(Gene.ID, P.viv.Gene.ID, PvLiverRank, PvBloodRank),
                       by='Gene.ID'
) %>%
  mutate(PcRankDiff=abs(PcLiverRank-PcBloodRank),
         PcLr=log2(PcLiverRank/PcBloodRank),
         PvRankDiff=abs(PvLiverRank-PvBloodRank),
         PvLr=log2(PvLiverRank/PvBloodRank))
## Warning in inner_join(pcyn %>% select(Gene.ID, P.cyn.Gene.ID, PcLiverRank, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 236 of `x` matches multiple rows in `y`.
## ℹ Row 167 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
LB.ranks %>%
  slice_sample(n=5)
##         Gene.ID P.cyn.Gene.ID PcLiverRank PcBloodRank P.viv.Gene.ID PvLiverRank
## 1 PF3D7_0728400  PcyM_0217600      3735.5      2954.5 PVP01_0214100      1617.5
## 2 PF3D7_1231300  PcyM_1453900        28.0      1430.0 PVP01_1449500       399.0
## 3 PF3D7_1308500  PcyM_1410100       701.5      1263.0 PVP01_1409500      3075.0
## 4 PF3D7_1318100  PcyM_1419800      3919.0      2877.0 PVP01_1419000      3681.0
## 5 PF3D7_1135300  PcyM_0940100      2786.0      2580.0 PVP01_0936100      3297.0
##   PvBloodRank PcRankDiff       PcLr PvRankDiff       PvLr
## 1      1834.5      781.0  0.3383874      217.0 -0.1816212
## 2       348.0     1402.0 -5.6744445       51.0  0.1973014
## 3      2602.5      561.5 -0.8483396      472.5  0.2406882
## 4      2775.0     1042.0  0.4459204      906.0  0.4076100
## 5      2154.0      206.0  0.1108242     1143.0  0.6141356

Visualize

p1 <- LB.ranks %>% ggplot(aes(x=PvBloodRank, y=PvLiverRank))+
  geom_density2d_filled(show.legend = FALSE)+
  geom_point(size=0.1, alpha=0.2, color='white')+
  labs(x='Blood', y='Liver')+
  ggtitle(expression(italic('P. vivax')))+
  scale_fill_viridis_d(option='mako')+
  scale_x_continuous(expand=expansion(mult=0))+
  scale_y_continuous(expand=expansion(mult=0))+
  theme_classic()+theme(aspect.ratio=1)

p2 <- LB.ranks %>% ggplot(aes(x=PcBloodRank, y=PcLiverRank))+
  geom_density2d_filled(show.legend = FALSE)+
  geom_point(size=0.1, alpha=0.2, color='white')+
  labs(x='Blood', y='Liver')+
  ggtitle(expression(italic('P. cynomolgi')))+
  scale_fill_viridis_d(option='mako')+
  scale_x_continuous(expand=expansion(mult=0))+
  scale_y_continuous(expand=expansion(mult=0))+
  theme_classic()+theme(aspect.ratio=1)

p1+p2+plot_annotation(title='Gene expression rank')

F2: Log10-transformed cumulative expression across Pf blood stage development

For the liver stage candidates, we want the genes with high expression at any point in the blood stage to score lower. Because we have multiple datasets describing the blood stage, we summarize blood expression as the maximum rank across datasets for each gene. This will be useful for liver stage candidates selection, but might need to be tweaked for blood candidates (because it may prioritize candidates with very high expression during a very limited section of the blood stage cycle, e.g. gametocytes, which are not very abundant in circulation and are conceivably not present for a long time).

idc.B %>% 
  select(Gene.ID, Gene.Name.or.Symbol, idcB.cumul) %>%
  slice_sample(n=5)
## # A tibble: 5 × 3
##   Gene.ID       Gene.Name.or.Symbol idcB.cumul
##   <chr>         <chr>                    <dbl>
## 1 PF3D7_0920400 N/A                       164.
## 2 PF3D7_0818400 FCF1                     3622.
## 3 PF3D7_0903600 N/A                      1191.
## 4 PF3D7_1202200 MPC                       369.
## 5 PF3D7_1218600 RRS                       567.

F3: Correlated expression of each gene between matched life cycle stages across species

P. cynomolgi 100 days blood series vs. P. falciparum Bartfai time series

Correlation is calculated over the mean expression in each dataset.

PcynPfal <- merge(pcyn100 %>% 
                    select(c(Gene.ID,PcynB.mean)), 
                  idc.B %>% rowwise() %>% 
                    mutate(idcB.mean=mean(c_across(starts_with('T')))) %>%
                    select(c(Gene.ID, idcB.mean)), 
                  all.x=TRUE, all.y=FALSE) %>%
  distinct()

ct1 <- cor.test(PcynPfal$PcynB.mean, PcynPfal$idcB.mean, method='pearson')
lr1 <- MASS::rlm(PcynB.mean ~ idcB.mean, data=PcynPfal)
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
res1 <- lr1$residuals

Visualize

p1 <- ggplot(PcynPfal, aes(x=idcB.mean, y=PcynB.mean,
                           xend=idcB.mean, yend=PcynB.mean-res1))+
  geom_segment(alpha=0.2)+
  geom_abline(slope=lr1$coefficients[[2]], 
              intercept=lr1$coefficients[[1]], 
              color='orange', size=1,)+
  stat_cor(color='orange')+
  theme_minimal()+theme(aspect.ratio=1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  # theme(plot.background=element_rect(fill='white'),
  #       axis.title=element_blank())+
  ggtitle('Zoom in')+theme(aspect.ratio=1)

p3 <- ggplot(PcynPfal, 
             aes(x=idcB.mean, y=res1))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')+theme(aspect.ratio=1)

p4 <- ggplot(PcynPfal, 
             aes(x=PcynB.mean, y=res1))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')+theme(aspect.ratio=1)

p1 + p2 + p3 + p4
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 195 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 195 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

P.falciparum 7 stages mean vs P. vivax mean

PvivPfal <- left_join(pviv %>% select(c(Gene.ID,Pviv.Bmean)), 
                      idc.B %>% select(c(Gene.ID,idcB.mean)), 
                      by='Gene.ID') %>% .[complete.cases(.), ]
## Warning in left_join(pviv %>% select(c(Gene.ID, Pviv.Bmean)), idc.B %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 167 of `x` matches multiple rows in `y`.
## ℹ Row 1800 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ct2 <- cor.test(PvivPfal$Pviv.Bmean, PvivPfal$idcB.mean, method='pearson')
lr2 <- MASS::rlm(Pviv.Bmean ~ idcB.mean, data=PvivPfal)
res2 <- lr2$residuals

Visualize

p1 <- ggplot(PvivPfal, aes(x=idcB.mean, xend=idcB.mean, 
                           y=Pviv.Bmean, yend=Pviv.Bmean-res2))+
  geom_segment(alpha=0.2)+
  geom_abline(slope=lr2$coefficients[[2]], 
              intercept=lr2$coefficients[[1]], 
              color='orange', size=1)+
  stat_cor(color='orange')+
  theme_minimal()+theme(aspect.ratio=1)

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Close-up')+theme(aspect.ratio=1)

p3 <- ggplot(PvivPfal, aes(x=idcB.mean, y=res2))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  ggtitle('Residuals')+theme(aspect.ratio=1)

p4 <- ggplot(PvivPfal, aes(x=Pviv.Bmean, y=res2))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  ggtitle('Residuals')+theme(aspect.ratio=1)

p1 + p2 + p3 + p4
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 212 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 212 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Sallivary gland sporozoite: P. vivax vs P. falciparum

We haven’t read in this data yet.

  • P. vivax sporozoites
PvSpz <- read.delim('Data/PlasmoDB/Jex_PvivaxSpz.txt') %>% 
  select(-c(source_id, X))

colnames(PvSpz) <- gsub('(.+sporozoites\\.\\.\\.)(PvSPZ\\..+)(\\.\\.\\..+$)','\\2', colnames(PvSpz))
colnames(PvSpz)[10] <- 'PfSPZ_Pviv'

PvSpz %<>% mutate(PvSpz.mean=rowMeans(.[2:9])) %>%
  rename(Pviv_Gene.ID=Gene.ID) %>% 
  left_join(pviv_ort, by=c('Pviv_Gene.ID'='P.viv.Gene.ID')) %>%
  filter(!is.na(Gene.ID))

PvSpz %>%
  slice_sample(n=5)
##    Pviv_Gene.ID PvSPZ.Thai2 PvSPZ.Thai3 PvSPZ.Thai4 PvSPZ.Thai5 PvSPZ.Thai6
## 1 PVP01_1228200       18.27       13.19        9.33       12.15       31.03
## 2 PVP01_0706200       58.22       72.56       46.73       30.15       51.51
## 3 PVP01_1321000       14.73       54.23       19.84       10.20       15.63
## 4 PVP01_0811000        1.84        9.00        2.86        0.00        4.43
## 5 PVP01_1455100       48.66      112.23       71.38       77.86       58.77
##   PvSPZ.Thai7 PvSPZ.Thai8 PvSPZ.Thai9 PfSPZ_Pviv PvSpz.mean       Gene.ID
## 1       26.35       21.26       17.26          0   18.60500 PF3D7_1326200
## 2       82.82       69.08       73.58          0   60.58125 PF3D7_0907700
## 3       25.53       14.66       34.23          0   23.63125 PF3D7_1428300
## 4       40.14        5.22        3.43          0    8.36500 PF3D7_1010900
## 5       60.03       67.42       87.17          0   72.94000 PF3D7_1237000
##                              Product.Description gene_source_id
## 1 conserved Plasmodium protein, unknown function  PF3D7_1326200
## 2                        proteasome activator 28  PF3D7_0907700
## 3 proliferation-associated protein 2g4, putative  PF3D7_1428300
## 4 conserved Plasmodium protein, unknown function  PF3D7_1010900
## 5               SUMO-activating enzyme subunit 2  PF3D7_1237000
##   Gene.Name.or.Symbol
## 1                 N/A
## 2                PA28
## 3                 N/A
## 4                 N/A
## 5                UBA2
  • P. falciparum sporozoites
PfSpz <- read.delim('Data/PlasmoDB/Hoffman_PfalSpz.txt') %>%
  select(-c(source_id, gene_source_id, X))

colnames(PfSpz)[3:5] <- c('Pf.sgSpz1', 'Pf.sgSpz2', 'Pf.cultSpz2')

PfSpz %>%
  slice_sample(n=5)
##         Gene.ID Gene.Name.or.Symbol Pf.sgSpz1 Pf.sgSpz2 Pf.cultSpz2
## 1 PF3D7_1102900                 N/A      0.59      0.00       44.55
## 2 PF3D7_1407200                 N/A     83.00     62.62       39.85
## 3 PF3D7_0723700                 N/A      8.60     21.39      222.64
## 4 PF3D7_1118800               ARC40      0.65      0.68       35.62
## 5 PF3D7_0506100                 N/A     25.56     34.35       88.35

sporozoites. The Pf data needs to be summarized as average.

SpzPvPf <- left_join(PvSpz %>% select(c(Gene.ID, PvSpz.mean)), 
                     PfSpz, by='Gene.ID') %>%
  rowwise() %>%
  mutate(PfSpz.mean=mean(Pf.sgSpz1, Pf.sgSpz2)) %>% 
  ungroup()
## Warning in left_join(PvSpz %>% select(c(Gene.ID, PvSpz.mean)), PfSpz, by = "Gene.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 74 of `x` matches multiple rows in `y`.
## ℹ Row 3146 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ct3 <- cor.test(SpzPvPf$PvSpz.mean, SpzPvPf$PfSpz.mean, method='pearson')
lr3 <- MASS::rlm(PvSpz.mean ~ PfSpz.mean, data=SpzPvPf)
res3 <- lr3$residuals

p1 <- ggplot(SpzPvPf, aes(x=PfSpz.mean, y=PvSpz.mean))+
  geom_segment(mapping=aes(x=PfSpz.mean, xend=PfSpz.mean, 
                           y=PvSpz.mean, yend=PvSpz.mean-res3), alpha=0.2)+
  geom_abline(slope=lr3$coefficients[[2]], 
              intercept=lr3$coefficients[[1]], 
              color='orange', size=1)+
  stat_cor(color='orange')+
  theme_minimal()+theme(aspect.ratio=1)

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')+theme(aspect.ratio=1)

p3 <- ggplot(SpzPvPf, aes(x=PfSpz.mean, y=res3))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+theme(aspect.ratio=1)
ggtitle('Residuals')
## $title
## [1] "Residuals"
## 
## attr(,"class")
## [1] "labels"
p4 <- ggplot(SpzPvPf, aes(x=PvSpz.mean, y=res3))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  theme(aspect.ratio=1)+
  ggtitle('Residuals')

p1 + p2 + p3 + p4
## Warning: Removed 139 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Assexual blood stages: P. falciparum vs. P. berghei

We haven’t loaded this data yet.

  • P. berghei (Hoeijmaker): this dataset has synchronized enriched isolated stages that will be compared to the Pf 7 stages dataset.
pber_ort <- read.delim('Data/PlasmoDB/PberPfal_SyntenicOrthologs.txt') %>% 
  select(-c(2,4)) %>% 
  rename(Pber_Gene.ID=Input.Ortholog.s.)

pber.H <- read.delim('Data/PlasmoDB/Hoeijmaker_Kant_PbergheiBloodStages.txt') %>% 
  select(c(1, grep('Asex_sex.RNA.Seq', colnames(.))))

colnames(pber.H) <- gsub('(.+)(\\.\\.\\.unique.+)', '\\1', colnames(pber.H))

pber.H %<>% mutate(PbRing.mean=rowMeans(.[,grep('Ring', names(.))]),
                   PbTroph.mean=rowMeans(.[,grep('Trophozoite', colnames(.))]),
                   PbSchiz.mean=rowMeans(.[,grep('Schizont', colnames(.))]),
                   PbGam.mean=rowMeans(.[,grep('Gametocyte', colnames(.))])) %>%
  rename(PbOokinete=cl15cy1...Ookinete) %>% select(Gene.ID, PbOokinete, grep('.mean', colnames(.))) %>%
  rename(Pber_Gene.ID=Gene.ID) %>%
  merge(pber_ort, ., by='Pber_Gene.ID', all=FALSE)
PbPf1 <- sevenstg %>%
  inner_join(pber.H, by='Gene.ID') %>%
  rowwise() %>%
  mutate(PfTroph.mean=mean(Early.Trophozoite, Late.Trophozoite),
         PfGam.mean=mean(Gametocyte.II, Gametocyte.V)) %>%
  ungroup()

PbPf1 %>%
  slice_sample(n=5)
## # A tibble: 5 × 18
##   Gene.ID Gene.Name.or.Symbol   Ring Early.Trophozoite Late.Trophozoite Schizont
##   <chr>   <chr>                <dbl>             <dbl>            <dbl>    <dbl>
## 1 PF3D7_… N/A                  42.5              42.0             28.0      8.32
## 2 PF3D7_… N/A                  10.5              14.5              9.69     0.66
## 3 PF3D7_… N/A                   4.56              1.42             1.65     3.32
## 4 PF3D7_… N/A                 171.               72.7            168.      84.7 
## 5 PF3D7_… N/A                  23.1              11.8             47.7     29.7 
## # ℹ 12 more variables: Gametocyte.II <dbl>, Gametocyte.V <dbl>,
## #   seven.max <dbl>, seven.max.excl.gam <dbl>, Pber_Gene.ID <chr>,
## #   PbOokinete <dbl>, PbRing.mean <dbl>, PbTroph.mean <dbl>,
## #   PbSchiz.mean <dbl>, PbGam.mean <dbl>, PfTroph.mean <dbl>, PfGam.mean <dbl>
Ring
ct4ring <- cor.test(PbPf1$PbRing.mean, PbPf1$Ring, method='pearson')
lr4ring <- MASS::rlm(PbRing.mean ~ Ring, data=PbPf1)
res4ring <- lr4ring$residuals

p1 <- ggplot(PbPf1, aes(x=Ring, y=PbRing.mean))+
  geom_segment(mapping=aes(x=Ring, xend=Ring, y=PbRing.mean, yend=PbRing.mean-res4ring), 
               alpha=0.2)+
  geom_abline(slope=lr4ring$coefficients[[2]], 
              intercept=lr4ring$coefficients[[1]], color='orange', size=1)+
  stat_cor(color='orange')+
  xlab(expression(italic('P. falciparum')))+
  ylab(expression(italic('P.berghei')))+
  theme_minimal()+theme(aspect.ratio=1)

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  theme(aspect.ratio=1)
ggtitle('Zoom in')
## $title
## [1] "Zoom in"
## 
## attr(,"class")
## [1] "labels"
p3 <- ggplot(PbPf1, aes(x=Ring, y=res4ring))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf1, aes(x=PbRing.mean, y=res4ring))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('Ring'))+p2+p3+p4
## Warning: Removed 174 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 174 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#####Schizont

ct4schiz <- cor.test(PbPf1$PbSchiz.mean, PbPf1$Schizont, method='pearson')
lr4schiz <- MASS::rlm(PbSchiz.mean ~ Schizont, data=PbPf1)
res4schiz <- lr4schiz$residuals

p1 <- ggplot(PbPf1, aes(x=Schizont, y=PbSchiz.mean))+
  geom_segment(mapping=aes(x=Schizont, xend=Schizont, y=PbSchiz.mean, yend=PbSchiz.mean-res4schiz), alpha=0.2)+
  geom_abline(slope=lr4schiz$coefficients[[2]], 
              intercept=lr4schiz$coefficients[[1]], color='orange', 
              size=1)+
  stat_cor(color='orange')+
  xlab(expression(italic('P. falciparum')))+
  ylab(expression(italic('P.berghei')))+
  theme_minimal()

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')

p3 <- ggplot(PbPf1, aes(x=Schizont, y=res4schiz))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf1, aes(x=PbSchiz.mean, y=res4schiz))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('Schizont'))+p2+p3+p4
## Warning: Removed 172 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 172 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Trophozoite
ct4troph <- cor.test(PbPf1$PbTroph.mean, PbPf1$PfTroph.mean, method='pearson')
lr4troph <- MASS::rlm(PbTroph.mean ~ PfTroph.mean, data=PbPf1, maxit=40)
res4troph <- lr4troph$residuals

p1 <- ggplot(PbPf1, aes(x=PfTroph.mean, y=PbTroph.mean))+
  geom_segment(mapping=aes(x=PfTroph.mean, xend=PfTroph.mean, 
                           y=PbTroph.mean, yend=PbTroph.mean-res4troph), 
               alpha=0.2)+
  geom_abline(slope=lr4troph$coefficients[[2]], 
              intercept=lr4troph$coefficients[[1]], 
              color='orange', size=1)+
  stat_cor(color='orange')+
  xlab(expression(italic('P. falciparum')))+ylab(expression(italic('P.berghei')))+
  theme_minimal()

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')

p3 <- ggplot(PbPf1, aes(x=PfTroph.mean, y=res4troph))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf1, aes(x=PbTroph.mean, y=res4troph))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('Trophozoite')) +p2+p3+p4
## Warning: Removed 211 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 211 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Gametocyte
ct4gam <- cor.test(PbPf1$PbGam.mean, PbPf1$PfGam.mean, method='pearson')
lr4gam <- MASS::rlm(PbGam.mean ~ PfGam.mean, data=PbPf1)
res4gam <- lr4gam$residuals

p1 <- ggplot(PbPf1, aes(x=PfGam.mean, y=PbGam.mean))+
  geom_segment(mapping=aes(x=PfGam.mean, xend=PfGam.mean, 
                           y=PbGam.mean, yend=PbGam.mean-res4gam), 
               alpha=0.2)+
  geom_abline(slope=lr4gam$coefficients[[2]], 
              intercept=lr4gam$coefficients[[1]], 
              color='orange', size=1)+
  stat_cor(color='orange')+
  xlab(expression(italic('P. falciparum')))+ylab(expression(italic('P.berghei')))+
  theme_minimal()

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')

p3 <- ggplot(PbPf1, aes(x=PfGam.mean, y=res4gam))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf1, aes(x=PbGam.mean, y=res4gam))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('Gametocytes')) +p2+p3+p4
## Warning: Removed 413 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 413 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

IEDC: P. falciparum (Bartfai) vs. P.berghei (Kant)

Correlation between the Bartfai and the Kant (Pb) datasets. The Pb data hasn’t been loaded yet. Since this is a time series dataset, it needs to be summarized as the mean expression across timepoints for each gene.

pber.K <- read.delim('Data/PlasmoDB/Hoeijmaker_Kant_PbergheiBloodStages.txt') %>% 
  select(c(1, grep('Control.Asexual', colnames(.))))

colnames(pber.K) <- gsub('(.+Control\\.Asexual\\.)([[:digit:]]+)(hr.+)', 'PbT\\2', 
                         colnames(pber.K))

pber.K %<>% rowwise() %>% 
  mutate(Pber.Bmean=mean(c_across(where(is.numeric))))

pber.K %<>% rename(Pber_Gene.ID=Gene.ID) %>% 
  merge(pber_ort, ., by='Pber_Gene.ID', all=FALSE)

pber.K %>% 
  slice_sample(n=5)
##     Pber_Gene.ID       Gene.ID   PbT0   PbT6  PbT12  PbT18  PbT24  PbT30
## 1 PBANKA_0414400 PF3D7_0316600 153.54 132.65 261.54 356.70 196.71 249.16
## 2 PBANKA_1305500 PF3D7_1441600  20.61  20.57   8.99  23.24  10.73   6.06
## 3 PBANKA_1110100 PF3D7_0510500  69.96  63.59  67.42  60.58  56.59  38.82
## 4 PBANKA_1361900 PF3D7_1349100  35.22  35.60  38.04  90.31  38.73  65.99
## 5 PBANKA_1116700 PF3D7_0617100  67.47  86.20  75.81  66.23  90.31  69.06
##   Pber.Bmean
## 1  225.05000
## 2   15.03333
## 3   59.49333
## 4   50.64833
## 5   75.84667
PbPf2 <- inner_join(idc.B %>% select(c(Gene.ID, idcB.mean)),
                    pber.K %>% select(c(Gene.ID, Pber.Bmean)),
                    by='Gene.ID')
## Warning in inner_join(idc.B %>% select(c(Gene.ID, idcB.mean)), pber.K %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 52 of `x` matches multiple rows in `y`.
## ℹ Row 160 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ct5 <- cor.test(PbPf2$Pber.Bmean, PbPf2$idcB.mean,
                method='pearson')
lr5 <- MASS::rlm(Pber.Bmean ~ idcB.mean,
                 data=PbPf2)

res5 <- lr5$residuals

p1 <- ggplot(PbPf2, aes(x=idcB.mean, y=Pber.Bmean))+
  geom_segment(mapping=aes(x=idcB.mean, xend=idcB.mean, 
                           y=Pber.Bmean, yend=Pber.Bmean-res5), 
               alpha=0.2)+
  geom_abline(slope=lr5$coefficients[[2]], 
              intercept=lr5$coefficients[[1]], 
              color='orange',size=1)+
  stat_cor(color='orange')+
  xlab(expression(paste(italic('P. falciparum'), ' (Toenhake 2018)')))+
  ylab(expression(italic('P.berghei')))+
  theme_minimal()

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')

p3 <- ggplot(PbPf2, aes(x=idcB.mean, y=res5))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf2, aes(x=Pber.Bmean, y=res5))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('IDC time series')) +p2+p3+p4
## Warning: Removed 216 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 216 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

IEDC: P. falciparum (Hoeijmaker) vs. P.berghei (Kant)

idc.H <- read.xlsx('Data/PlasmoDB/Hoeijmaker_PfIDCtimepoints.xlsx')

colnames(idc.H) <- gsub('(sense\\.-\\.)(.+)(\\.hours.+)', 't\\2hpi', colnames(idc.H))

idc.H %<>% select(-c(2, 12))

colnames(idc.H) <- gsub(' ', '\\.', colnames(idc.H))

idc.H %<>% rowwise() %>% 
  mutate(idcH.max=max(c_across(where(is.numeric))),
         idcH.mean=mean(c_across(where(is.numeric))),
         idcH.cumul=sum(c_across(ends_with('hpi'))) 
         )

idc.H %>%
  slice_sample(n=5)
## # A tibble: 5,548 × 13
## # Rowwise: 
##    Gene.ID  `t40-5hpi` `t2-10hpi` `t7-15hpi` `t12-20hpi` `t17-25hpi` `t22-30hpi`
##    <chr>         <dbl>      <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
##  1 PF3D7_0…      11.1        2.88       1.53        2.17        5.15        4.26
##  2 PF3D7_0…       5.64       4.61       5.58        4.03        2.18        2.08
##  3 PF3D7_0…       0.07       0.14       0.11        0.1         0.13        0.17
##  4 PF3D7_0…       0.4        0.25       0.29        0.11        0.06        0.18
##  5 PF3D7_0…       0.5        0          0           0           0           0   
##  6 PF3D7_0…       0.05       0.11       0           0.08        0           0   
##  7 PF3D7_0…       0          0          0           0           0           0   
##  8 PF3D7_0…       0          0          0           0           0.06        0.06
##  9 PF3D7_0…       0.05       0.16       0.12        0.12        0.06        0.38
## 10 PF3D7_0…       0.05       0.16       0.24        0.08        0.18        0.06
## # ℹ 5,538 more rows
## # ℹ 6 more variables: `t27-35hpi` <dbl>, `t32-40hpi` <dbl>,
## #   Gene.Name.or.Symbol <chr>, idcH.max <dbl>, idcH.mean <dbl>,
## #   idcH.cumul <dbl>
PbPf3 <- inner_join(idc.H[,c('Gene.ID','idcH.mean')], 
                    pber.K[,c('Gene.ID','Pber.Bmean')], 
                    by='Gene.ID')
## Warning in inner_join(idc.H[, c("Gene.ID", "idcH.mean")], pber.K[, c("Gene.ID", : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 52 of `x` matches multiple rows in `y`.
## ℹ Row 160 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ct6 <- cor.test(PbPf3$Pber.Bmean, 
                PbPf3$idcH.mean, 
                method='pearson')

lr6 <- MASS::rlm(Pber.Bmean ~ idcH.mean, data=PbPf3)
res6 <- lr6$residuals

p1 <- ggplot(PbPf3, aes(x=idcH.mean, xend=idcH.mean, 
                        y=Pber.Bmean, yend=Pber.Bmean-res6))+
  geom_segment(alpha=0.2)+
  geom_abline(slope=lr6$coefficients[[2]], 
              intercept=lr6$coefficients[[1]], 
              color='orange',size=1)+
  stat_cor(color='orange')+
  xlab(expression(paste(italic('P. falciparum'), ' (Hoeijmaker)')))+
  ylab(expression(italic('P.berghei')))+
  theme_minimal()

p2 <- p1+xlim(c(0,500))+ylim(c(0,1000))+
  ggtitle('Zoom in')

p3 <- ggplot(PbPf3, aes(x=idcH.mean, y=res6))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

p4 <- ggplot(PbPf3, aes(x=Pber.Bmean, y=res6))+
  geom_point(color='firebrick', size=0.3)+
  geom_smooth(color='darkorchid4',fill='darkorchid4')+
  theme_minimal()+
  ggtitle('Residuals')

(p1+ggtitle('IDC time series'))+p2+p3+p4
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 231 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: The following aesthetics were dropped during statistical transformation: xend
## and yend.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 231 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Consolidate residuals

  • In order to calculate the F3 of the score, we need to consolidate the residuals for each correlation.
  • The residual is transformed as its fraction over the theoretical value according to the rlm for that gene (because of the observation made from the plots above that the size of the residuals increases with the value of gene expression).
    -The final calculated metric R is the sum of all the residuals for each gene.
r1 <- data.frame('Gene.ID'=PcynPfal$Gene.ID,
                 'XbPc'=PcynPfal$PcynB.mean,
                 'XbPf'=PcynPfal$idcB.mean,
                 'ResidualPcBlood'=res1) %>%
  rowwise() %>% 
  mutate(RbPc=ResidualPcBlood/(lr1$coefficients[[1]]+lr1$coefficients[[2]]*XbPf)) 

r2 <- data.frame('Gene.ID'=PvivPfal$Gene.ID,
                 'XbPv'=PvivPfal$Pviv.Bmean,
                 'XbPf'=PvivPfal$idcB.mean,
                 'ResidualPvBlood'=res2) %>%
  rowwise() %>% mutate(RbPv=ResidualPvBlood/(lr2$coefficients[[1]]+lr3$coefficients[[2]]*XbPf))

r3 <- data.frame('Gene.ID'=SpzPvPf$Gene.ID,
                 'XsPf'=SpzPvPf$Pf.sgSpz1,
                 'XsPv'=SpzPvPf$PvSpz.mean,
                 'ResidualPvSpz'=res3) %>%
  rowwise() %>% mutate(RsPv=ResidualPvSpz/(lr3$coefficients[[1]]+lr3$coefficients[[2]]*XsPf))

r4 <- data.frame('Gene.ID'=PbPf1$Gene.ID,
                 'XrPb'=PbPf1$PbRing.mean,
                 'XrPf'=PbPf1$Ring,
                 'ResidualPbPfRing'=res4ring,
                 'XscPb'=PbPf1$PbSchiz.mean,
                 'XscPf'=PbPf1$Schizont,
                 'ResidualPbPfSchizont'=res4schiz,
                 'XtPb'=PbPf1$PbTroph.mean,
                 'XtPf'=PbPf1$PfTroph.mean,
                 'ResidualPbPfTroph'=res4troph,
                 'XgPb'=PbPf1$PbGam.mean,
                 'XgPf'=PbPf1$PfGam.mean,
                 'ResidualPbPfGam'=res4gam) %>%
  rowwise() %>% mutate(RrPb=ResidualPbPfRing/(lr4ring$coefficients[[1]]+lr4ring$coefficients[[2]]*XrPf),
                       RscPb=ResidualPbPfSchizont/(lr4schiz$coefficients[[1]]+lr4schiz$coefficients[[2]]*XscPf),
                       RtPb=ResidualPbPfTroph/(lr4troph$coefficients[[1]]+lr4troph$coefficients[[2]]*XtPf),
                       RgPb=ResidualPbPfGam/(lr4gam$coefficients[[1]]+lr4gam$coefficients[[2]]*XgPf))

r5 <- data.frame('Gene.ID'=PbPf2$Gene.ID,
                 'XbPb'=PbPf2$Pber.Bmean,
                 'XbPf'=PbPf2$idcB.mean,
                 'ResidualPbPfBlood'=res5) %>%
  rowwise() %>% mutate(RbPb.B=ResidualPbPfBlood/(lr5$coefficients[[1]]+lr5$coefficients[[2]]*XbPf)) 

r6 <- data.frame('Gene.ID'=PbPf3$Gene.ID,
                 'XbPb'=PbPf3$Pber.Bmean,
                 'XbPf'=PbPf3$idcH.mean,
                 'ResidualPbPfBlood'=res6) %>%
  rowwise() %>% mutate(RbPb.H=ResidualPbPfBlood/(lr6$coefficients[[1]]+lr6$coefficients[[2]]*XbPf))

Res <- merge(r1[,c('Gene.ID','RbPc')], 
             r2[,c('Gene.ID','RbPv')], by='Gene.ID') %>% 
  merge(., r3[, c('Gene.ID','RsPv')], by='Gene.ID') %>%
  merge(., r4[, c('Gene.ID', 'RrPb','RscPb','RtPb','RgPb')], by='Gene.ID') %>%
  merge(., r5[,c('Gene.ID','RbPb.B')], by='Gene.ID') %>%
  merge(., r6[,c('Gene.ID','RbPb.H')], by='Gene.ID') %>%
  distinct(.keep_all=TRUE)  


Res %<>% rowwise() %>% mutate(R=sum(abs(c_across(where(is.numeric)))))
head(Res)
## # A tibble: 6 × 11
## # Rowwise: 
##   Gene.ID      RbPc   RbPv    RsPv    RrPb   RscPb   RtPb    RgPb  RbPb.B RbPb.H
##   <chr>       <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>  <dbl>
## 1 mal_mito_2 -0.990  1.05  -0.164  -0.899  -0.889  -0.892 -0.911   0.284   0.822
## 2 mal_mito_3 -0.987  0.415 -0.604  -0.0857  2.67    0.922 -0.111   0.474   1.36 
## 3 PF3D7_010…  0.162  0.227 -0.402   1.36   -0.311   0.682  1.16   -0.0827  0.295
## 4 PF3D7_010… -0.694 -0.259 -0.0315 -0.579   0.0688 -0.863  0.0888 -0.141  -0.300
## 5 PF3D7_010… -0.816 -0.169 -0.780  -0.229  -0.0503 -0.142 -0.206   0.746   0.334
## 6 PF3D7_010…  0.360 -0.552  1.51   -0.449  -0.729  -0.198 -0.835  -0.704  -0.519
## # ℹ 1 more variable: R <dbl>

Calculate the scores

scoring <- LB.ranks %>% select(c(Gene.ID, PcLr, PcRankDiff, PvLr, PvRankDiff)) %>%
  inner_join(Res %>% select(c(Gene.ID, R)), 
             by='Gene.ID') %>% 
  inner_join(idc.B %>% select(Gene.ID, Gene.Name.or.Symbol, idcB.cumul), 
             by='Gene.ID') %>% 
  rowwise() %>% 
  mutate(logCumul=log10(idcB.cumul))
## Warning in inner_join(., idc.B %>% select(Gene.ID, Gene.Name.or.Symbol, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 147 of `x` matches multiple rows in `y`.
## ℹ Row 82 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
maxCumul = max(scoring$logCumul)
minCumul = min(scoring$logCumul)
scoring %<>% rowwise() %>%
  mutate(mu=(logCumul-minCumul)/(maxCumul-minCumul))


scoring %<>%  rowwise() %>% 
  mutate( F1= ( PcLr*abs(PcRankDiff) ) + ( PvLr*abs(PvRankDiff) ),
          F3= 1/(abs(R))) %>%
  mutate(F2=ifelse(F1>0, 1-mu, ifelse(F1<0, mu, 0))) %>%
  mutate(Score=F1*F2*F3) %>% 
  distinct() %>%
  ungroup()

scoring %>%
  select(Gene.ID, Gene.Name.or.Symbol, F1, F2, F3, Score) %>%
  arrange(-Score)
## # A tibble: 3,848 × 6
##    Gene.ID       Gene.Name.or.Symbol     F1    F2     F3 Score
##    <chr>         <chr>                <dbl> <dbl>  <dbl> <dbl>
##  1 PF3D7_0602300 PALM                41934. 0.739 0.170  5277.
##  2 PF3D7_1418100 LISP1               48421. 0.861 0.0872 3635.
##  3 PF3D7_0615100 ENR                 28416. 0.737 0.165  3449.
##  4 PF3D7_0211400 KASIII              37814. 0.682 0.125  3226.
##  5 PF3D7_1201700 N/A                 40725. 0.549 0.127  2835.
##  6 PF3D7_1446400 pdhB                35725. 0.604 0.129  2792.
##  7 PF3D7_0922900 FabG                32765. 0.673 0.125  2758.
##  8 PF3D7_1246100 ALAS                29526. 0.653 0.129  2488.
##  9 PF3D7_1312000 MCAT                18860. 0.714 0.160  2160.
## 10 PF3D7_1368000 N/A                 15757. 0.599 0.226  2134.
## # ℹ 3,838 more rows

Visualize

scoring %<>% 
  rowwise() %>%
  mutate(Gene.label=ifelse(Gene.Name.or.Symbol!='N/A', Gene.Name.or.Symbol, Gene.ID))
  • How does expression look vs new and old score?
p1 <- scoring %>% left_join(LB.ranks, by='Gene.ID') %>%
  ggplot(aes(x=PvBloodRank, y=PvLiverRank, color=Score))+
  geom_point(alpha=0.7, size=0.3)+
  labs(x='Blood', y='Liver')+
  ggtitle(expression(italic('P. vivax')))+
  scale_color_viridis_c(option='inferno')+
  theme_classic()+theme(aspect.ratio=1)

p2 <- scoring %>% left_join(LB.ranks, by='Gene.ID') %>%
  ggplot(aes(x=PcBloodRank, y=PcLiverRank, color=Score))+
  geom_point(alpha=0.7, size=0.3)+
  labs(x='Blood', y='Liver')+
  ggtitle(expression(italic('P. cynomolgi')))+
  scale_color_viridis_c(option='inferno')+
  theme_classic()+theme(aspect.ratio=1)

(p1|p2)&plot_annotation(title='Rank of expression')

scoring %>% 
  left_join(idc.B, by='Gene.ID') %>%
  select(c(Gene.ID, Score, starts_with('T'))) %>%
  pivot_longer(cols=starts_with('T'), 
               names_to='Time',
               values_to='Expression') %>%
  mutate(Time=as.numeric(gsub('T', '', Time))) %>%
  arrange(-Score) %>%
  ggplot(aes(x=Time, y=Expression, color=Score, 
             group=Gene.ID))+
  geom_line(mapping=aes(order=Score))+
  scale_color_viridis_c(option='inferno')+
  scale_y_log10(oob=oob_squish)+annotation_logticks(outside=TRUE, sides='l')+
  scale_x_continuous(expand=expansion(mult=0))+
  theme_classic()
## Warning in geom_line(mapping = aes(order = Score)): Ignoring unknown
## aesthetics: order
## Warning in scale_y_log10(oob = oob_squish): log-10 transformation introduced
## infinite values.

Save results

write.table(scoring, 
            file='GeneExpressionScores.txt', 
            sep='\t', quote=FALSE, row.names=FALSE)